<h1 align="center">Building a Word2Vec Model with Twitter Data</h1>
<h3 align="center">Written by:</h3>
<h3 align="center">Jorge A. Castañón</h3>
<h4 align="center">The code used in this tutorial is adapted from Jorge A. Castañón's GitHub page.</h4>
Let's start by downloading the corpus of tweets, tweets.gz (118 MB).
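Spark reads gzip-compressed text files directly, so there is no need to unpack the archive. As a quick sanity check (assuming a SparkContext `sc` is already available, as it is in the cells below, and that the file was saved to the data path used later in this notebook), you can count and peek at the raw JSON lines:
# illustrative sanity check on the downloaded corpus
raw = sc.textFile('/Users/jorgecastanon/Documents/github/w2v/data/tweets.gz')
print raw.count()        # number of raw tweet JSON lines
print raw.first()[:200]  # peek at the beginning of the first record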
In [1]:
import os, sys, codecs, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
from IPython.display import display
from pyspark.mllib.feature import Word2Vec
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import PCA
In [3]:
def readFilters(filterpath):
    filters = set()
    f = open(filterpath, 'r')
    for line in f:
        line = codecs.decode(line, "utf-8")
        line = line.strip()
        if len(line) == 0: continue
        filters.add(line.lower())
    f.close()
    return filters
def process(filters):
    def realProcess(line):
        key = 'text'
        try:
            t = json.loads(line)
            # keep only English tweets that contain a 'text' field
            if key not in t or t['lang'] != 'en': return None
            value = t[key].lower()
            # return the tweet as soon as it matches one of the filter keywords
            for ff in filters:
                if ff in value:
                    return t
        except Exception as e:
            return None
        return None
    return realProcess
def filterAndTrainWord2vec(datapath, filterpath, word):
    filters = readFilters(filterpath)
    data = sc.textFile(datapath)
    totaltw = data.count()
    # the next line filters the tweets of interest
    tweets = data.map(process(filters)).filter(lambda x: x != None).map(lambda t: t['text'])
    twcount = tweets.count()
    # the next line removes unwanted characters and transforms the text to lower case
    tweets = tweets.map(lambda x: x.replace(";"," ").replace(":"," ").replace('"',' ').replace('-',' ').replace(',',' ').replace('.',' ').lower())
    # the next line breaks tweets into words
    tweets = tweets.map(lambda row: row.split(" "))
    ## train Word2Vec
    word2vec = Word2Vec()
    model = word2vec.fit(tweets)
    ## get the list of words in the Word2Vec matrix
    vocabsize = 10000
    any_word = word
    tmp_list = model.findSynonyms(any_word, vocabsize - 1)  # caps the vocabulary at 10,000 words
    list_words = []
    for l in tmp_list:
        list_words.append(l[0])
    list_words.append(any_word)
    nwords = len(list_words)
    nfeatures = model.transform(any_word).array.shape[0]
    ## construct the feature matrix; each row is associated with a word in list_words
    feature_matrix = []
    for word in list_words:
        feature_matrix.append(model.transform(word).array)
    ## save the Word2Vec matrix and the list of words
    np.save('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myW2Vmatrix.npy', feature_matrix)
    np.save('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myWordList.npy', list_words)
    print "================================================="
    print "Number of total tweets processed: ", totaltw
    print "================================================="
    print "Number of filtered tweets used: ", twcount
    print "================================================="
    print "Number of words in the model:", nwords
    print "================================================="
    print "Number of features per word: ", nfeatures
    print "================================================="
    return nwords
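As a side note, findSynonyms returns (word, cosine similarity) pairs, which is what the loop above unpacks to build the vocabulary list. A minimal query sketch, assuming a trained model object like the one created inside filterAndTrainWord2vec and an example query word:
# illustrative only: each entry returned by findSynonyms is a (word, cosineSimilarity) pair
for w, sim in model.findSynonyms('christmas', 5):
    print w, sim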
In [4]:
def clusterWords(K):
    Feat = np.load('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myW2Vmatrix.npy')  # reads the matrix generated by Word2Vec
    words = np.load('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myWordList.npy')  # reads the list of words
    Featshape = Feat.shape
    Feat = sc.parallelize(Feat)
    ## K-means clustering with Spark
    maxiters = 100
    clusters = KMeans.train(Feat, k = K, maxIterations = maxiters)
    ## get the cluster label for each word and save the labels to a numpy file
    labels = Feat.map(lambda point: clusters.predict(point))  # assign a cluster label to each vector (word)
    list_labels = labels.collect()
    np.save('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myClusters.npy', list_labels)
    print "="*70
    print "Size of the Word2vec matrix (words, features) is: ", Featshape
    print "="*70
    print "Number of clusters used: ", K
    print "="*70
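If you want a rough quality measure for a given K, MLlib's KMeansModel also exposes computeCost, the within-cluster sum of squared distances. A sketch of two lines that could be added inside clusterWords right after KMeans.train (this metric is not part of the original script):
# optional: within-cluster sum of squared distances for the trained K-means model
wssse = clusters.computeCost(Feat)
print "Within-cluster sum of squared distances: ", wssse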
In [5]:
def findSimilarWords(word, nwords = 20):
    Feat = np.load('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myW2Vmatrix.npy')
    words = np.load('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myWordList.npy')
    labels = np.load('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myClusters.npy')
    Nw = words.shape[0]                       # total number of words
    ind_star = np.where(word == words)        # find the index of the chosen word
    wstar = Feat[ind_star,:][0][0]            # vector corresponding to the chosen word
    nwstar = math.sqrt(np.dot(wstar, wstar))  # norm of the vector corresponding to the chosen word
    dist = np.zeros(Nw)                       # initialize the vector of cosine similarities
    i = 0
    for w in Feat:                            # loop to compute the cosine similarity to each word
        den = math.sqrt(np.dot(w, w)) * nwstar  # denominator of the cosine similarity
        dist[i] = abs(np.dot(wstar, w)) / den   # cosine similarity to each word
        i = i + 1
    indexes = np.argpartition(dist, -(nwords + 1))[-(nwords + 1):]  # indexes of the (nwords+1) most similar words
    indexes = indexes[np.argsort(dist[indexes])]                    # sort those indexes by similarity
    di = []
    for j in range(nwords + 1):
        di.append((words[indexes[j]], dist[indexes[j]], labels[indexes[j]]))
    result = pd.DataFrame(di, columns = ["word", "similarity", "cluster"])
    return result.iloc[::-1]                  # order results from most to least similar
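The quantity stored in dist above is the absolute cosine similarity between the chosen word's vector $w^\star$ and each word vector $w$ in the Word2Vec matrix:

$$\mathrm{sim}(w^\star, w) = \frac{\left| w^\star \cdot w \right|}{\lVert w^\star \rVert \, \lVert w \rVert}$$

Values close to 1 mean the two words appear in similar contexts in the filtered tweets.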
In [6]:
def visualizeWords(maxWordsVis):
    Feat = np.load('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myW2Vmatrix.npy')
    words = np.load('/Users/jorgecastanon/Documents/github/w2v/mllib-scripts/myWordList.npy')
    # to RDD; avoid this with big matrices by reading them directly from HDFS
    Feat = sc.parallelize(Feat)
    Feat = Feat.map(lambda vec: (Vectors.dense(vec),))
    # to DataFrame
    dfFeat = sqlContext.createDataFrame(Feat, ["features"])
    ## PCA to project the feature matrix to 3 dimensions
    numComponents = 3
    pca = PCA(k = numComponents, inputCol = "features", outputCol = "pcaFeatures")
    model = pca.fit(dfFeat)
    dfComp = model.transform(dfFeat).select("pcaFeatures")
    # get the first three components as lists to be plotted
    compX = dfComp.map(lambda vec: vec[0][0]).take(maxWordsVis)
    compY = dfComp.map(lambda vec: vec[0][1]).take(maxWordsVis)
    compZ = dfComp.map(lambda vec: vec[0][2]).take(maxWordsVis)
    return compX, compY, compZ, words
In [7]:
pathToKeywords = '/Users/jorgecastanon/Documents/github/w2v/data/filter.txt'
pathToTweets = '/Users/jorgecastanon/Documents/github/w2v/data/tweets.gz'
word = 'christmas'
numWords = filterAndTrainWord2vec(pathToTweets,pathToKeywords,word)
In [8]:
K = int(math.floor(math.sqrt(float(numWords)/2)))
# K ~ sqrt(n/2) this is a rule of thumb for choosing K,
# where n is the number of words in the model
# feel free to choose K with a fancier algorithm
clusterWords(K)
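For example, if the model ended up with 10,000 words, this rule would give K = floor(sqrt(10000 / 2)) = floor(70.7) = 70 clusters.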
In [9]:
res = findSimilarWords(word = 'christmas', nwords = 10)
display(res)
In [10]:
maxWordsVis = 15
compX, compY, compZ, words = visualizeWords(maxWordsVis)
In [11]:
%matplotlib inline
fs=20 #fontsize
w = words[0:maxWordsVis]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
height = 10
width = 10
fig.set_size_inches(width, height)
ax.scatter(compX, compY, compZ, color='red', s=100, marker='o', edgecolors='black')
for i, txt in enumerate(w):
ax.text(compX[i],compY[i],compZ[i], '%s' % (txt), size=8, zorder=1, color='k')
ax.set_xlabel('1st. Component', fontsize=fs)
ax.set_ylabel('2nd. Component', fontsize=fs)
ax.set_zlabel('3rd. Component', fontsize=fs)
ax.set_title('Visualization of Word2Vec via PCA', fontsize=fs)
ax.grid(True)
plt.show()